Dataset Preprocessing¶

Library loading¶

In [1]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import yaml
import os
import warnings
import socket
import matplotlib as plt


warnings.filterwarnings('ignore')
In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
with open("../data/resources/rcParams.yaml") as f:
    rcParamsDict = yaml.full_load(f)
    for k in rcParamsDict["rcParams"]:
        print("{} {}".format(k,rcParamsDict["rcParams"][k]))
        plt.rcParams[k] = rcParamsDict["rcParams"][k]
    for k1 in set(list(rcParamsDict)).difference(set(["rcParams"])):
        print("{} {}".format(k1,rcParamsDict[k1]))
scanpy==1.8.0 anndata==0.8.0 umap==0.4.6 numpy==1.22.2 scipy==1.6.2 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.13.5 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.3
figure.dpi 50
savefig.dpi 500
figure.figsize [10, 10]
axes.facecolor white
dotSize 20
In [3]:
DSname="UpD100_2"
DSDirName="Sample_S20813_259"

Configure paths¶

In [4]:
outdir = "../data/output"
if not os.path.exists(outdir):
   # Create a new directory because it does not exist
   os.makedirs(outdir)
    
if not os.path.exists("../data/output/adatas"):
   # Create a new directory because it does not exist
   os.makedirs("../data/output/adatas")

with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
results_file = outdir+'/'+DSname+'.h5ad'
In [5]:
scanpyObj = sc.read_10x_mtx('../data/'+DSDirName+'/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

scanpyObj.obs['dataset'] = DSname
scanpyObj.obs_names = [i + "_" + j for i, j in zip(scanpyObj.obs_names.tolist(), scanpyObj.obs["dataset"].tolist())]
... reading from cache file cache/..-data-Sample_S20813_259-filtered_feature_bc_matrix-matrix.h5ad
In [6]:
scanpyObj.var_names_make_unique()
scanpyObj.obs_names_make_unique()

Importing cellIDs¶

In [7]:
cellID = pd.read_csv('../data/'+DSDirName+'/aggregatedCall/aggregatedCall.tsv', sep ="\t",index_col = 0)
cellID
cellID.index = [i + "_" + DSname for i in cellID.index.tolist()]
In [8]:
cellID.Consensus.unique()
Out[8]:
array(['3391B', 'KOLF', 'MIFF1', 'A15461', 'doublet', 'LowQuality'],
      dtype=object)
In [9]:
scanpyObj.obs['cellID'] = cellID.loc[scanpyObj.obs_names, 'Consensus']
In [10]:
scanpyObj.obs
Out[10]:
dataset cellID
AAACCTGAGAAAGTGG-1_UpD100_2 UpD100_2 3391B
AAACCTGCAAACCCAT-1_UpD100_2 UpD100_2 KOLF
AAACCTGCACGGTAGA-1_UpD100_2 UpD100_2 LowQuality
AAACCTGCAGTGGAGT-1_UpD100_2 UpD100_2 KOLF
AAACCTGCATACCATG-1_UpD100_2 UpD100_2 MIFF1
... ... ...
TTTGTCATCGAGAGCA-1_UpD100_2 UpD100_2 KOLF
TTTGTCATCGCTAGCG-1_UpD100_2 UpD100_2 MIFF1
TTTGTCATCTACTATC-1_UpD100_2 UpD100_2 KOLF
TTTGTCATCTCATTCA-1_UpD100_2 UpD100_2 KOLF
TTTGTCATCTTCATGT-1_UpD100_2 UpD100_2 MIFF1

6088 rows × 2 columns

Configure colors¶

In [11]:
cellID_colors = {}
cellID_newName_colors = {}
cellID_newNames = {}


for line in iPSC_lines_map.keys():
    cellID_colors[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["color"]
    cellID_newName_colors[iPSC_lines_map[line]["newName"]] = iPSC_lines_map[line]["color"]
    cellID_newNames[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["newName"]

scanpyObj.obs["cellID"] = scanpyObj.obs["cellID"].astype("category")
scanpyObj.obs["cellID_newName"] = scanpyObj.obs["cellID"].replace(cellID_newNames, inplace=False).astype("category")
scanpyObj.uns["cellID_colors"] = [cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
scanpyObj.uns["cellID_newName_colors"] = [cellID_newName_colors[line] for line in scanpyObj.obs["cellID_newName"].cat.categories]
In [12]:
[cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
Out[12]:
['#DBB807', '#FFFF00', '#0FB248', '#a8a5a5', '#7B00FF', '#403b3b']

Preprocessing¶

Show those genes that yield the highest fraction of counts in each single cells, across all cells.

In [13]:
sc.pl.highest_expr_genes(scanpyObj, n_top=20 )
normalizing counts per cell
    finished (0:00:00)

Subsetting according to good barcodes¶

In [14]:
goodBarcodes=pd.read_csv(outdir+'/'+DSname+'_filteredCells.tsv', sep="\t")["BCs"]+"_"+DSname
scanpyObj = scanpyObj[goodBarcodes,:]

QC metrices¶

In [15]:
scanpyObj.var['mt'] = scanpyObj.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)

scanpyObj.var['ribo'] = scanpyObj.var_names.str.startswith('RP')  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['ribo'], percent_top=None, log1p=True, inplace=True)
In [16]:
scanpyObj
Out[16]:
AnnData object with n_obs × n_vars = 3930 × 33538
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo'
    uns: 'cellID_colors', 'cellID_newName_colors'
In [17]:
scanpyObj.var
Out[17]:
gene_ids feature_types mt n_cells_by_counts mean_counts log1p_mean_counts pct_dropout_by_counts total_counts log1p_total_counts ribo
MIR1302-2HG ENSG00000243485 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
FAM138A ENSG00000237613 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
OR4F5 ENSG00000186092 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AL627309.1 ENSG00000238009 Gene Expression False 34 0.008906 0.008866 99.134860 35.0 3.583519 False
AL627309.3 ENSG00000239945 Gene Expression False 3 0.000763 0.000763 99.923664 3.0 1.386294 False
... ... ... ... ... ... ... ... ... ... ...
AC233755.2 ENSG00000277856 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC233755.1 ENSG00000275063 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC240274.1 ENSG00000271254 Gene Expression False 482 0.143003 0.133659 87.735369 562.0 6.333280 False
AC213203.1 ENSG00000277475 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
FAM231C ENSG00000268674 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False

33538 rows × 10 columns

In [18]:
scanpyObj.obs
Out[18]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCTGAGAAAGTGG-1_UpD100_2 UpD100_2 3391B CTL01 2343 7.759614 6344.0 8.755423 149.0 5.010635 2.348676 1301.0 7.171657 20.507566
AAACCTGCAAACCCAT-1_UpD100_2 UpD100_2 KOLF CTL08A 3723 8.222554 13352.0 9.499496 178.0 5.187386 1.333134 4003.0 8.295049 29.980528
AAACCTGCAGTGGAGT-1_UpD100_2 UpD100_2 KOLF CTL08A 4202 8.343554 18905.0 9.847235 363.0 5.897154 1.920127 5884.0 8.680162 31.124041
AAACCTGCATACCATG-1_UpD100_2 UpD100_2 MIFF1 CTL02A 2531 7.836765 4977.0 8.512783 31.0 3.465736 0.622865 389.0 5.966147 7.815953
AAACCTGGTAGCGCTC-1_UpD100_2 UpD100_2 MIFF1 CTL02A 3927 8.275886 12702.0 9.449594 200.0 5.303305 1.574555 2211.0 7.701653 17.406708
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTCAGTGGTGTAG-1_UpD100_2 UpD100_2 KOLF CTL08A 1488 7.305860 2980.0 8.000014 20.0 3.044523 0.671141 625.0 6.439351 20.973154
TTTGTCAGTTACGTCA-1_UpD100_2 UpD100_2 KOLF CTL08A 3146 8.054205 11376.0 9.339349 168.0 5.129899 1.476793 3625.0 8.195886 31.865332
TTTGTCATCGCTAGCG-1_UpD100_2 UpD100_2 MIFF1 CTL02A 4968 8.510974 15271.0 9.633777 8.0 2.197225 0.052387 1382.0 7.232010 9.049833
TTTGTCATCTACTATC-1_UpD100_2 UpD100_2 KOLF CTL08A 4573 8.428143 14148.0 9.557399 97.0 4.584968 0.685609 2072.0 7.636752 14.645180
TTTGTCATCTCATTCA-1_UpD100_2 UpD100_2 KOLF CTL08A 3368 8.122371 10504.0 9.259606 248.0 5.517453 2.361005 2112.0 7.655864 20.106625

3930 rows × 13 columns

In [19]:
sc.pl.violin(scanpyObj, ['n_genes_by_counts', 'total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['total_counts_mt','total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [20]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [21]:
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_ribo')
In [22]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [23]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [24]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True)
In [25]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [26]:
scanpyObj.write_h5ad(outdir+'/adatas/'+DSname+'_raw.h5ad')
In [27]:
sc.pp.normalize_total(scanpyObj, exclude_highly_expressed=True, max_fraction=.1)
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['FTH1', 'MALAT1']
    finished (0:00:00)
In [28]:
sc.pp.log1p(scanpyObj)
In [29]:
scanpyObj.raw = scanpyObj

Subset according to JointHVGs and Filtered Barcodes from previous step¶

In [30]:
HVGs=pd.read_csv(outdir+"/HVG_list_intersection_Curated.txt", sep = "\t")["HVG"]

scanpyObj = scanpyObj[:,HVGs]
scanpyObj.var["highly_variable"] = True
In [31]:
#sc.pp.highly_variable_genes(scanpyObj, min_mean=0.0125, max_mean=5, min_disp=0.5)
In [32]:
#scanpyObj = scanpyObj[:, HVG.tolist()]

#Multiplexing = Multiplexing[:, Multiplexing.var.highly_variable]
In [33]:
#sc.pl.highly_variable_genes(scanpyObj)

Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance.

In [34]:
sc.pp.regress_out(scanpyObj, ['total_counts',"pct_counts_mt"])
regressing out ['total_counts', 'pct_counts_mt']
    sparse input is densified and may lead to high memory use
    finished (0:00:14)

Scale each gene to unit variance. Clip values exceeding standard deviation 10.

In [35]:
sc.pp.scale(scanpyObj, max_value=20)
In [36]:
scanpyObj.obs
Out[36]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCTGAGAAAGTGG-1_UpD100_2 UpD100_2 3391B CTL01 2343 7.759614 6344.0 8.755423 149.0 5.010635 2.348676 1301.0 7.171657 20.507566
AAACCTGCAAACCCAT-1_UpD100_2 UpD100_2 KOLF CTL08A 3723 8.222554 13352.0 9.499496 178.0 5.187386 1.333134 4003.0 8.295049 29.980528
AAACCTGCAGTGGAGT-1_UpD100_2 UpD100_2 KOLF CTL08A 4202 8.343554 18905.0 9.847235 363.0 5.897154 1.920127 5884.0 8.680162 31.124041
AAACCTGCATACCATG-1_UpD100_2 UpD100_2 MIFF1 CTL02A 2531 7.836765 4977.0 8.512783 31.0 3.465736 0.622865 389.0 5.966147 7.815953
AAACCTGGTAGCGCTC-1_UpD100_2 UpD100_2 MIFF1 CTL02A 3927 8.275886 12702.0 9.449594 200.0 5.303305 1.574555 2211.0 7.701653 17.406708
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTCAGTGGTGTAG-1_UpD100_2 UpD100_2 KOLF CTL08A 1488 7.305860 2980.0 8.000014 20.0 3.044523 0.671141 625.0 6.439351 20.973154
TTTGTCAGTTACGTCA-1_UpD100_2 UpD100_2 KOLF CTL08A 3146 8.054205 11376.0 9.339349 168.0 5.129899 1.476793 3625.0 8.195886 31.865332
TTTGTCATCGCTAGCG-1_UpD100_2 UpD100_2 MIFF1 CTL02A 4968 8.510974 15271.0 9.633777 8.0 2.197225 0.052387 1382.0 7.232010 9.049833
TTTGTCATCTACTATC-1_UpD100_2 UpD100_2 KOLF CTL08A 4573 8.428143 14148.0 9.557399 97.0 4.584968 0.685609 2072.0 7.636752 14.645180
TTTGTCATCTCATTCA-1_UpD100_2 UpD100_2 KOLF CTL08A 3368 8.122371 10504.0 9.259606 248.0 5.517453 2.361005 2112.0 7.655864 20.106625

3930 rows × 13 columns

Principal component analysis¶

In [37]:
sc.tl.pca(scanpyObj, svd_solver='arpack')
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
In [38]:
sc.pl.pca(scanpyObj, color='MKI67')
In [39]:
sc.pl.pca_variance_ratio(scanpyObj, log=True)
In [40]:
scanpyObj
Out[40]:
AnnData object with n_obs × n_vars = 3930 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

Computing the neighborhood graph¶

In [41]:
sc.pp.neighbors(scanpyObj, n_neighbors=10, n_pcs=9)
computing neighbors
    using 'X_pca' with n_pcs = 9
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)

Embedding the neighborhood graph¶

In [42]:
sc.tl.umap(scanpyObj)
scanpyObj
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:03)
Out[42]:
AnnData object with n_obs × n_vars = 3930 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [43]:
sc.pl.umap(scanpyObj, color=['MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [44]:
sc.tl.diffmap(scanpyObj)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:00:00)
    eigenvalues of transition matrix
    [1.         0.99435955 0.9928631  0.9921437  0.9904389  0.9862388
     0.9845218  0.9815995  0.9752868  0.97394925 0.9696207  0.9632086
     0.9610075  0.95771706 0.95704937]
    finished: added
    'X_diffmap', diffmap coordinates (adata.obsm)
    'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
In [45]:
sc.tl.dpt(scanpyObj)
WARNING: No root cell found. To compute pseudotime, pass the index or expression vector of a root cell, one of:
    adata.uns['iroot'] = root_cell_index
    adata.var['xroot'] = adata[root_cell_name, :].X
computing Diffusion Pseudotime using n_dcs=10
    finished: added
 (0:00:00)
In [46]:
sc.pl.diffmap(scanpyObj, color=[ 'MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [47]:
scanpyObj.X.max()
Out[47]:
20.0